package au.com.acpfg.io.genbank.reader; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import org.knime.core.data.DataCell; import org.knime.core.data.DataColumnSpec; import org.knime.core.data.DataColumnSpecCreator; import org.knime.core.data.DataTableSpec; import org.knime.core.data.DataType; import org.knime.core.data.collection.CollectionCellFactory; import org.knime.core.data.collection.ListCell; import org.knime.core.data.def.DefaultRow; import org.knime.core.data.def.StringCell; import org.knime.core.node.BufferedDataContainer; import org.knime.core.node.BufferedDataTable; import org.knime.core.node.CanceledExecutionException; import org.knime.core.node.ExecutionContext; import org.knime.core.node.ExecutionMonitor; import org.knime.core.node.InvalidSettingsException; import org.knime.core.node.NodeLogger; import org.knime.core.node.NodeModel; import org.knime.core.node.NodeSettingsRO; import org.knime.core.node.NodeSettingsWO; import org.knime.core.node.defaultnodesettings.SettingsModelBoolean; import org.knime.core.node.defaultnodesettings.SettingsModelString; import org.knime.core.node.defaultnodesettings.SettingsModelStringArray; /** * Non-biojava based implementation which should be much faster and resilient to poor annotation * The factory class now instantiates this node model rather than the biojava-based one, but changing * the factory should be possible in future. * * @author andrew.cassin * */ public class FastGenbankNodeModel extends NodeModel implements GenbankFeatureListener { // number of columns in first output port private final static int NCOLS_PORT0 = 10; // summary data output port private final static int NCOLS_PORT1 = 10; // source properties eg. /organism private final static int NCOLS_PORT2 = 8; // cds properties eg. proteins thought coded by a given gene // the logger instance private static final NodeLogger logger = NodeLogger .getLogger(FastGenbankNodeModel.class); /** the settings key which is used to retrieve and store the settings (from the dialog or from a settings file) (package visibility to be usable from the dialog). */ static final String CFGKEY_ISFILE = "folder-or-folder?"; static final String CFGKEY_FILE = "filename"; static final String CFGKEY_FOLDER = "foldername"; static final String CFGKEY_TAXONOMY_FILTER ="taxonomy-filter-keywords"; static final String CFGKEY_SOURCE_FEATURES = "output-source-features?"; static final String CFGKEY_CDS_FEATURES = "output-cgs-features?"; static final String CFGKEY_FILENAME_FILTER = "filename-filter-keywords"; private final SettingsModelBoolean m_isfile = new SettingsModelBoolean(CFGKEY_ISFILE, true); private final SettingsModelString m_filename= new SettingsModelString(CFGKEY_FILE, "c:/temp/gb.seq"); private final SettingsModelString m_folder = new SettingsModelString(CFGKEY_FOLDER, "c:/temp"); private final SettingsModelString m_taxonomy_filter = new SettingsModelString(CFGKEY_TAXONOMY_FILTER, "Lolium"); private final SettingsModelBoolean m_source_features = new SettingsModelBoolean(CFGKEY_SOURCE_FEATURES, true); private final SettingsModelBoolean m_cds_features = new SettingsModelBoolean(CFGKEY_CDS_FEATURES, true); private final SettingsModelString m_fname_filter = new SettingsModelString(CFGKEY_FILENAME_FILTER, ""); // HACK: as the GenbankFeatureListener is called before the taxonomy filter is applied, we must save the rows temporarily... yuk! private Vector<DefaultRow> m_container2_rows; private Vector<DefaultRow> m_container3_rows; private static int src_id; private static int cds_id; private final static Pattern feature_match = Pattern.compile("\\s/(\\w+)=\"([^\"]+?)\"\\s*$", Pattern.MULTILINE | Pattern.DOTALL); protected FastGenbankNodeModel() { super(0, 3); m_filename.setEnabled(m_isfile.getBooleanValue()); m_folder.setEnabled(!m_isfile.getBooleanValue()); } @Override protected void loadInternals(File nodeInternDir, ExecutionMonitor exec) throws IOException, CanceledExecutionException { // TODO Auto-generated method stub } @Override protected void saveInternals(File nodeInternDir, ExecutionMonitor exec) throws IOException, CanceledExecutionException { // TODO Auto-generated method stub } /** * Responsible for ensuring a valid DataCell is returned (use a missing cell if <code>str</code> is not valid) * @param str * @return a valid KNIME data cell (NOT guaranteed to be a StringCell) */ protected DataCell safe_cell(String str) { return (str != null) ? new StringCell(str) : DataType.getMissingCell(); } /** * Responsible for ensuring a valid DataCell is returned (use a missing cell if <code>str</code> is not valid) * @param str * @return a valid KNIME data cell (NOT guaranteed to be a StringCell) */ protected DataCell safe_cell(StringBuffer str) { if (str == null) return DataType.getMissingCell(); return safe_cell(str.toString()); } /** * Responsible for ensuring a valid collection cell is returned */ protected DataCell safe_cell(List<StringCell> cells) { if (cells == null || cells.size() < 1) return DataType.getMissingCell(); return CollectionCellFactory.createListCell(cells); } /** * Similar to <code>safe_cell</code> but this guards against a non-existant map as well */ protected DataCell safe_feature(Map<String, String> map, String key) { if (map == null) return DataType.getMissingCell(); String val = map.get(key); if (val == null) return DataType.getMissingCell(); return new StringCell(val); } /** * Returns an inputstream object, depending on whether the file is likely compressed or not * * @param f * @return * @throws IOException */ private InputStream make_input_stream(File f) throws IOException { boolean is_compressed = f.getName().endsWith(".gz"); if (is_compressed) { return new GZIPInputStream(new FileInputStream(f)); } else { return new FileInputStream(f); } } @Override protected DataTableSpec[] configure(final DataTableSpec[] inSpecs) throws InvalidSettingsException { return make_output_cols(); } protected DataTableSpec[] make_output_cols() { DataTableSpec[] out_tables = new DataTableSpec[3]; // the data table spec of the single output table, // the table will have three columns: DataColumnSpec[] allColSpecs = new DataColumnSpec[NCOLS_PORT0]; allColSpecs[0] = new DataColumnSpecCreator("GenBank Locus Name", StringCell.TYPE).createSpec(); allColSpecs[1] = new DataColumnSpecCreator("GenBank Sequence", StringCell.TYPE).createSpec(); allColSpecs[2] = new DataColumnSpecCreator("Filename", StringCell.TYPE).createSpec(); allColSpecs[3] = new DataColumnSpecCreator("Molecule Type", StringCell.TYPE).createSpec(); allColSpecs[4] = new DataColumnSpecCreator("Entry Last Modified Date", StringCell.TYPE).createSpec(); allColSpecs[5] = new DataColumnSpecCreator("Entry Version", StringCell.TYPE).createSpec(); allColSpecs[6] = new DataColumnSpecCreator("Comments", StringCell.TYPE).createSpec(); allColSpecs[7] = new DataColumnSpecCreator("Accession", StringCell.TYPE).createSpec(); allColSpecs[8] = new DataColumnSpecCreator("Definition", StringCell.TYPE).createSpec(); allColSpecs[9] = new DataColumnSpecCreator("NCBI Taxonomy (& lineage)", StringCell.TYPE).createSpec(); out_tables[0] = new DataTableSpec(allColSpecs); DataColumnSpec[] allFeatureColSpecs = new DataColumnSpec[NCOLS_PORT1]; allFeatureColSpecs[0] = new DataColumnSpecCreator("Accession", StringCell.TYPE).createSpec(); allFeatureColSpecs[1] = new DataColumnSpecCreator("Organism", StringCell.TYPE).createSpec(); allFeatureColSpecs[2] = new DataColumnSpecCreator("Molecule Type", StringCell.TYPE).createSpec(); allFeatureColSpecs[3] = new DataColumnSpecCreator("Strain", StringCell.TYPE).createSpec(); allFeatureColSpecs[4] = new DataColumnSpecCreator("Database Xref", StringCell.TYPE).createSpec(); allFeatureColSpecs[5] = new DataColumnSpecCreator("Clone ID", StringCell.TYPE).createSpec(); allFeatureColSpecs[6] = new DataColumnSpecCreator("Tissue Type", StringCell.TYPE).createSpec(); allFeatureColSpecs[7] = new DataColumnSpecCreator("Development Stage", StringCell.TYPE).createSpec(); allFeatureColSpecs[8] = new DataColumnSpecCreator("Clone Library",StringCell.TYPE).createSpec(); allFeatureColSpecs[9] = new DataColumnSpecCreator("Note", StringCell.TYPE).createSpec(); out_tables[1] = new DataTableSpec(allFeatureColSpecs); DataColumnSpec[] allCodingColSpecs = new DataColumnSpec[NCOLS_PORT2]; allCodingColSpecs[0] = new DataColumnSpecCreator("Accession", StringCell.TYPE).createSpec(); allCodingColSpecs[1] = new DataColumnSpecCreator("Gene", StringCell.TYPE).createSpec(); allCodingColSpecs[2] = new DataColumnSpecCreator("Product", StringCell.TYPE).createSpec(); allCodingColSpecs[3] = new DataColumnSpecCreator("Database Xref", StringCell.TYPE).createSpec(); allCodingColSpecs[4] = new DataColumnSpecCreator("Translation", StringCell.TYPE).createSpec(); allCodingColSpecs[5] = new DataColumnSpecCreator("Note", StringCell.TYPE).createSpec(); allCodingColSpecs[6] = new DataColumnSpecCreator("Protein ID", StringCell.TYPE).createSpec(); allCodingColSpecs[7] = new DataColumnSpecCreator("Function", StringCell.TYPE).createSpec(); out_tables[2] = new DataTableSpec(allCodingColSpecs); return out_tables; } @Override protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception { ArrayList<File> files_to_read = new ArrayList<File>(); File[] scan_files; if (m_isfile.getBooleanValue()) { scan_files = new File[] { new File(m_filename.getStringValue()) }; } else { scan_files = new File(m_folder.getStringValue()).listFiles(); } boolean has_filename_filter= (m_fname_filter.getStringValue().trim().length() > 0); String[] filename_keywords = m_fname_filter.getStringValue().split("\\s+"); for (File f : scan_files) { if (!f.isFile() || !f.canRead() || f.length() < 1) { logger.info("Skipping inaccessible file: "+f.getName()); continue; } String fname = f.getName().toLowerCase(); if (has_filename_filter) { boolean found = false; for (String keyword : filename_keywords) { if (fname.indexOf(keyword.toLowerCase()) >= 0) { found = true; break; } } if (!found) { logger.info("Filename "+fname+" does not meet filename filter... ignoring"); continue; } files_to_read.add(f); } else { // no filename filter specified... assume likely genbank files (uncompressed or gzip'ed) if (fname.startsWith("gbest")) { files_to_read.add(f); } else if (fname.endsWith(".gb") || fname.endsWith(".gb.gz") || fname.endsWith(".gbk") || fname.endsWith(".gbk.gz")) { files_to_read.add(f); } else if (fname.endsWith(".seq.gz") || fname.endsWith(".seq")) { files_to_read.add(f); } } } logger.info("GenBank Reader: found "+files_to_read.size()+" plausible GenBank data files to load"); DataTableSpec[] out_tables = make_output_cols(); // the execution context will provide us with storage capacity, in this // case a data container to which we will add rows sequentially // Note, this container can also handle arbitrary big data tables, it // will buffer to disc if necessary. final BufferedDataContainer container = exec.createDataContainer(out_tables[0]); final BufferedDataContainer container2= exec.createDataContainer(out_tables[1]); final BufferedDataContainer container3= exec.createDataContainer(out_tables[2]); int done_files = 0; int hit = 1; src_id = 1; cds_id = 1; // setup the match data structure (int[]) for the taxa String[] taxa = m_taxonomy_filter.getStringValue().split("\\s+"); boolean has_taxa_filter = false; for (String t : taxa) { if (t.length() > 0) { has_taxa_filter = true; break; } } // process the files int failed_files = 0; for (File f : files_to_read) { int cnt = 0; int accepted = 0; // make a new stream rather than use one which has been partially read BufferedReader rdr = null; try { rdr = new BufferedReader(new InputStreamReader(make_input_stream(f))); String line; while ((line = rdr.readLine()) != null) { if (!line.startsWith("LOCUS")) { continue; } StringBuffer rec = new StringBuffer(10*1024); rec.append(line); rec.append('\n'); while ((line = rdr.readLine()) != null) { rec.append(line); rec.append('\n'); if (line.startsWith("//")) { cnt++; break; } } m_container2_rows = null; m_container3_rows = null; GenbankRecord gbr = new GenbankRecord(rec, this); // HACK: for now we only support filtering by organism or lineage... if (has_taxa_filter) { boolean found = false; String lineage = gbr.get_taxonomy(); for (String term : taxa) { if (term.length() < 1) continue; if (lineage.toLowerCase().indexOf(term.toLowerCase()) >= 0) { found = true; break; } } if (!found) continue; } // add the row to the first output port, since it has passed the taxonomy filter (if any) DataCell[] cells = new DataCell[NCOLS_PORT0]; cells[0] = safe_cell(gbr.get_locus_name()); cells[1] = safe_cell(gbr.get_filtered_sequence()); cells[2] = new StringCell(f.getName()); cells[3] = safe_cell(gbr.get_molecule_type()); cells[4] = safe_cell(gbr.get_last_modified()); cells[5] = safe_cell(gbr.get_version()); cells[6] = safe_cell(gbr.get_comment()); cells[7] = safe_cell(gbr.get_accession()); cells[8] = safe_cell(gbr.get_definition()); cells[9] = safe_cell(gbr.get_taxonomy()); accepted++; String row_key = "GB"+hit; container.addRowToTable(new DefaultRow(row_key, cells)); // add the cells to the feature output ports if (m_container2_rows != null) { for (DefaultRow r : m_container2_rows) { container2.addRowToTable(r); } } if (m_container3_rows != null) { for (DefaultRow r : m_container3_rows) { container3.addRowToTable(r); } } hit++; if (hit % 200 == 0) { exec.checkCanceled(); } // TODO... if (line == null) break; } rdr.close(); logger.info("Processed "+cnt+" genbank entries (accepted "+accepted+") in "+f.getName()); } catch (Exception e) { if (rdr != null) rdr.close(); failed_files++; logger.warn("Error in genbank record in "+f.getName()+" error msg is: "); logger.warn(e.getMessage()); e.printStackTrace(); } done_files++; exec.checkCanceled(); exec.setProgress(((double) done_files) / files_to_read.size()); } logger.info("Processed "+done_files+" files ("+failed_files+" contained errors). Loading complete."); // once we are done, we close the container and return its table container.close(); container2.close(); container3.close(); BufferedDataTable out = container.getTable(); BufferedDataTable out2= container2.getTable(); BufferedDataTable out3= container3.getTable(); return new BufferedDataTable[]{out, out2, out3}; } /** * {@inheritDoc} */ @Override protected void saveSettingsTo(final NodeSettingsWO settings) { m_isfile.saveSettingsTo(settings); m_folder.saveSettingsTo(settings); m_filename.saveSettingsTo(settings); m_taxonomy_filter.saveSettingsTo(settings); m_source_features.saveSettingsTo(settings); m_cds_features.saveSettingsTo(settings); m_fname_filter.saveSettingsTo(settings); } /** * {@inheritDoc} */ @Override protected void loadValidatedSettingsFrom(final NodeSettingsRO settings) throws InvalidSettingsException { m_isfile.loadSettingsFrom(settings); m_folder.loadSettingsFrom(settings); m_filename.loadSettingsFrom(settings); m_taxonomy_filter.loadSettingsFrom(settings); m_source_features.loadSettingsFrom(settings); m_cds_features.loadSettingsFrom(settings); m_fname_filter.loadSettingsFrom(settings); } /** * {@inheritDoc} */ @Override protected void validateSettings(final NodeSettingsRO settings) throws InvalidSettingsException { m_isfile.validateSettings(settings); m_folder.validateSettings(settings); m_filename.validateSettings(settings); m_taxonomy_filter.validateSettings(settings); m_source_features.validateSettings(settings); m_cds_features.validateSettings(settings); m_fname_filter.validateSettings(settings); } @Override protected void reset() { // TODO Auto-generated method stub } @Override public void parse_section(String title, String accsn, String content) throws InvalidGenbankRecordException { if (title.equals("source") && m_source_features.getBooleanValue()) { if (m_container2_rows == null) m_container2_rows = new Vector<DefaultRow>(); HashMap<String,String> feature_properties = new HashMap<String,String>(); //TODO... do something with feature position? Matcher m = feature_match.matcher(content); while (m.find()) { feature_properties.put(m.group(1).toLowerCase(), m.group(2)); } DataCell[] cells = new DataCell[NCOLS_PORT1]; cells[0] = safe_cell(accsn); cells[1] = safe_feature(feature_properties, "organism"); cells[2] = safe_feature(feature_properties, "mol_type"); cells[3] = safe_feature(feature_properties, "strain"); cells[4] = safe_feature(feature_properties, "db_xref"); cells[5] = safe_feature(feature_properties, "clone"); cells[6] = safe_feature(feature_properties, "tissue_type"); cells[7] = safe_feature(feature_properties, "dev_stage"); cells[8] = safe_feature(feature_properties, "clone_lib"); cells[9] = safe_feature(feature_properties, "note"); m_container2_rows.add(new DefaultRow("S"+src_id++, cells)); } else if (title.equals("cds") && m_cds_features.getBooleanValue()) { if (m_container3_rows == null) m_container3_rows = new Vector<DefaultRow>(); HashMap<String,String> feature_properties = new HashMap<String,String>(); Matcher m = feature_match.matcher(content); while (m.find()) { feature_properties.put(m.group(1).toLowerCase(), m.group(2)); } DataCell[] cells = new DataCell[NCOLS_PORT2]; cells[0] = safe_cell(accsn); cells[1] = safe_feature(feature_properties, "gene"); cells[2] = safe_feature(feature_properties, "product"); cells[3] = safe_feature(feature_properties, "db_xref"); cells[4] = safe_feature(feature_properties, "translation"); cells[5] = safe_feature(feature_properties, "note"); cells[6] = safe_feature(feature_properties, "protein_id"); cells[7] = safe_feature(feature_properties, "function"); m_container3_rows.add(new DefaultRow("CDS"+cds_id++, cells)); } } }